{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xgboost as xgb\n",
    "import sys,random\n",
    "import pickle\n",
    "import os\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import modin.pandas as pd\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists('featurescore'):\n",
    "    os.mkdir('featurescore')\n",
    "if not os.path.exists('model'):\n",
    "    os.mkdir('model')\n",
    "if not os.path.exists('preds'):\n",
    "    os.mkdir('preds')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用旧数据集 train集的维度为：  \n",
    "使用新数据集 train集的维度为：6702"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 加载训练数据(无tag)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 55.8 s, sys: 3.56 s, total: 59.3 s\n",
      "Wall time: 1min 1s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "data_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id','loan_hour'])\n",
    "data_raw = pd.read_csv('../../preprocess_data_new/train_ax_nodup.csv',nrows=33465).drop(columns=['id','loan_dt','tag'])\n",
    "data_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',nrows=33465).drop(columns=['id'])\n",
    "data_label = pd.read_csv('../../preprocess_data/train_y_33465.csv',usecols=['label'])\n",
    "data = pd.concat([data_date,data_raw,data_null],axis=1)\n",
    "x_train = data.fillna(-1).values\n",
    "y_train = data_label.values.ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(33465, 6701)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_train.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 加载unlabel数据(无tag）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 55s, sys: 9.84 s, total: 2min 5s\n",
      "Wall time: 2min 11s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "unlabel_date = pd.read_csv('../../preprocess_data/unlabel_x_date.csv').drop(columns=['id','loan_hour'])\n",
    "unlabel_raw = pd.read_csv('../../preprocess_data_new/train_ax_nodup.csv',skiprows=range(1,33466)).drop(columns=['id','loan_dt','tag'])\n",
    "unlabel_null = pd.read_csv('../../preprocess_data_new/train_ax_row_null.csv',skiprows=range(1,33466)).drop(columns=['id'])\n",
    "unlabel = pd.concat([unlabel_date,unlabel_raw,unlabel_null],axis=1)\n",
    "x_test = unlabel.fillna(-1).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(66535, 6701)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 定义学习器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def SelectModel(model_name):\n",
    "    if model_name == 'GBC':\n",
    "        from sklearn.ensemble import GradientBoostingClassifier\n",
    "        model = GradientBoostingClassifier(loss='deviance',\n",
    "                                           learning_rate =0.1,\n",
    "                                           n_estimators=300,\n",
    "                                           subsample=0.9,\n",
    "                                           max_depth=3,\n",
    "#                                            verbose=1,\n",
    "                                          random_state=2018)\n",
    "    elif model_name == 'XGB':\n",
    "        from xgboost import XGBClassifier\n",
    "\n",
    "        model = XGBClassifier(max_depth=6,\n",
    "                              learning_rate =0.04, \n",
    "                              booster='gbtree',\n",
    "                              objective='binary:logistic',\n",
    "                              early_stopping_rounds=100,\n",
    "                              scale_pos_weight=float(len(y_train)-np.sum(y_train))/float(np.sum(y_train)),\n",
    "                              eval_metric='auc',\n",
    "                              gamma=1,\n",
    "                              reg_lambda=1,\n",
    "                              subsample=0.9,\n",
    "                              min_child_weight=1,\n",
    "                              seed=2018,\n",
    "                              silent=False,\n",
    "                              n_jobs=24,\n",
    "                              num_boost_round =300\n",
    "                             )\n",
    "    elif model_name == 'RFC':\n",
    "        from sklearn.ensemble import RandomForestClassifier\n",
    "        model = RandomForestClassifier(n_estimators=1500,\n",
    "                                       n_jobs =36,\n",
    "                                       max_features='sqrt',\n",
    "                                       class_weight='balanced',\n",
    "#                                        verbose =1,\n",
    "                                       random_state=2018)\n",
    "    elif model_name == 'LGB':\n",
    "        from lightgbm import LGBMClassifier\n",
    "        model = LGBMClassifier(boost='gbdt',\n",
    "                    num_leaves=135, \n",
    "                    scale_pos_weight=float(len(y_train)-np.sum(y_train.ravel()))/float(np.sum(y_train.ravel())),\n",
    "                    max_depth=-1,\n",
    "                    learning_rate=.05,\n",
    "                    max_bin=200,\n",
    "                    min_data_in_leaf= 60,\n",
    "                    objective='binary',\n",
    "                    metric='auc',\n",
    "                    num_threads=32,\n",
    "                    slient=False,\n",
    "                    num_boost_round =300)\n",
    "    else:\n",
    "        pass\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "XGB\n",
      "RFC\n",
      "LGB\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/anaconda3/lib/python3.6/site-packages/lightgbm/engine.py:102: UserWarning: Found `num_boost_round` in params. Will use it instead of argument\n",
      "  warnings.warn(\"Found `{}` in params. Will use it instead of argument\".format(alias))\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "import joblib\n",
    "model_list = ['XGB','RFC','LGB']\n",
    "unlabel_pred_list = []\n",
    "\n",
    "for model in model_list:\n",
    "    print(model)\n",
    "    clf = SelectModel(model)\n",
    "    clf.fit(x_train,y_train)\n",
    "    joblib.dump(clf,'./model/{}'.format(model))  \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "import os\n",
    "model_list = os.listdir('./model')\n",
    "clf = joblib.load('./model/{}'.format(model_list[0]))\n",
    "unlabel_pred = clf.predict_proba(x_test)[:,1]\n",
    "\n",
    "for model in model_list[1:]:\n",
    "    clf = joblib.load('./model/{}'.format(model))\n",
    "    unlabel_pred += clf.predict_proba(x_test)[:,1]\n",
    "unlabel_pred = unlabel_pred/len(model_list)\n",
    "# 输出\n",
    "unlabel_id = pd.read_csv('../../preprocess_data/unlabel_x_date.csv',usecols=['id']).values.ravel()\n",
    "pred = pd.DataFrame({'id':unlabel_id,'prob':unlabel_pred})\n",
    "pred.to_csv('./preds/unlabel_pred.csv',index=False,encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
